# Manual tests for the BERT tokenizer's handling of sentence-pair inputs.
from transformers import BertTokenizer

# Tokenizer loaded from a local bert-base-uncased checkpoint; the path is
# relative to the working directory this script is run from.
bert_tokenizer = BertTokenizer.from_pretrained("../bert_model/bert-base-uncased")

# Exercise the tokenizer on a sentence pair (text, text_pair) to inspect how
# special tokens, segment ids, and the attention mask are laid out.
if __name__ == '__main__':
    first_sentence = "I have a pen."
    second_sentence = "I have a pencil."

    # encode() on a pair returns a flat id list shaped [CLS] a [SEP] b [SEP]:
    # [101, 1045, 2031, 1037, 7279, 1012, 102, 1045, 2031, 1037, 14745, 1012, 102]
    print(bert_tokenizer.encode(first_sentence, second_sentence))

    # Calling the tokenizer itself returns the full encoding dict; with
    # return_tensors='pt' each field is a torch.Tensor, e.g.:
    #   'input_ids':      [101, 1045, 2031, 1037, 7279, 1012, 102, 1045, 2031, 1037, 14745, 1012, 102]
    #   'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    #   'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    encoding = bert_tokenizer(first_sentence, second_sentence, return_tensors='pt')

    attention_mask = encoding["attention_mask"]
    print(type(attention_mask), attention_mask.dtype, attention_mask)
    # <class 'torch.Tensor'> torch.int64 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
